In [1]:
import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import matplotlib.pyplot as mtp # for data visualization
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
In [2]:
data = pd.read_csv(r"C:\Users\laxma\Downloads\affairs.csv")
In [3]:
data.head()
Out[3]:
Unnamed: 0 rate_marriage age yrs_married children religious educ occupation occupation_husb affairs
0 0 3.0 32.0 9.0 3.0 3.0 17.0 2.0 5.0 0.111111
1 1 3.0 27.0 13.0 3.0 1.0 14.0 3.0 4.0 3.230769
2 2 4.0 22.0 2.5 0.0 1.0 16.0 3.0 5.0 1.400000
3 3 4.0 37.0 16.5 4.0 3.0 16.0 5.0 5.0 0.727273
4 4 5.0 27.0 9.0 1.0 1.0 14.0 3.0 4.0 4.666666
In [4]:
data.tail()
Out[4]:
Unnamed: 0 rate_marriage age yrs_married children religious educ occupation occupation_husb affairs
6361 6361 5.0 32.0 13.0 2.0 3.0 17.0 4.0 3.0 0.0
6362 6362 4.0 32.0 13.0 1.0 1.0 16.0 5.0 5.0 0.0
6363 6363 5.0 22.0 2.5 0.0 2.0 14.0 3.0 1.0 0.0
6364 6364 5.0 32.0 6.0 1.0 3.0 14.0 3.0 4.0 0.0
6365 6365 4.0 22.0 2.5 0.0 2.0 16.0 2.0 4.0 0.0
In [5]:
data.duplicated().sum()
Out[5]:
0
In [6]:
data.columns
Out[6]:
Index(['Unnamed: 0', 'rate_marriage', 'age', 'yrs_married', 'children',
       'religious', 'educ', 'occupation', 'occupation_husb', 'affairs'],
      dtype='object')
In [7]:
#VISUALIZATION
In [8]:
plt.bar(data['rate_marriage'],data['age'])
plt.xticks(rotation=90)
plt.show()
In [9]:
fig=px.bar(data,x='yrs_married',y='age',color='yrs_married')
fig.show()
In [10]:
plt.scatter(data['children'],data['rate_marriage'],color='red')
plt.xticks(rotation=90)
plt.show()
In [11]:
plt.figure(figsize=(10,4))
sns.countplot(x='educ', data=data, color='b')
plt.show()
In [12]:
plt.figure(figsize=(10,6))
top_car = data['children'].value_counts().nlargest(10)
sns.countplot(y=data.children,color='cyan')
Out[12]:
<AxesSubplot:xlabel='count', ylabel='children'>
In [13]:
sns.lineplot(x='religious', y='Unnamed: 0', data=data)
Out[13]:
<AxesSubplot:xlabel='religious', ylabel='Unnamed: 0'>
In [14]:
sns.barplot(data['occupation'],data['age'],color='r')
plt.xticks(rotation=90)
plt.show()
D:\anaconda files\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:

Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

In [15]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='yrs_married', y='educ')
plt.xlabel('yrs_married')
plt.ylabel('educ')
plt.show()
In [16]:
sns.displot(data["occupation_husb"])
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x280c52710a0>
In [17]:
sns.relplot(x='rate_marriage',y='occupation',data=data)
Out[17]:
<seaborn.axisgrid.FacetGrid at 0x280c5030640>
In [18]:
sns.countplot(x='occupation',data=data)
plt.xticks(rotation=90)
Out[18]:
(array([0, 1, 2, 3, 4, 5]),
 [Text(0, 0, '1.0'),
  Text(1, 0, '2.0'),
  Text(2, 0, '3.0'),
  Text(3, 0, '4.0'),
  Text(4, 0, '5.0'),
  Text(5, 0, '6.0')])
In [19]:
sns.boxplot(x='rate_marriage',y='affairs',data=data)
Out[19]:
<AxesSubplot:xlabel='rate_marriage', ylabel='affairs'>
In [20]:
sns.violinplot(x='occupation',y='occupation_husb',data=data)
Out[20]:
<AxesSubplot:xlabel='occupation', ylabel='occupation_husb'>
In [21]:
#MODEL BUILDING
In [22]:
x = data.iloc[:,[4,5]].values
In [23]:
import scipy.cluster.hierarchy as shc
dendro = shc.dendrogram(shc.linkage(x, method='ward'))
mtp.title('Dendrogram Plot')
mtp.ylabel('Euclidean Distance')
mtp.xlabel('Customer')
mtp.show()
In [24]:
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean',linkage='ward')
y_pred=hc.fit_predict(x)
In [25]:
y_pred
Out[25]:
array([0, 2, 3, ..., 3, 1, 3], dtype=int64)
In [26]:
mtp.scatter(x[y_pred == 0, 0], x[y_pred == 0,1], s = 100, c = 'blue', label = 'Cluster 1')
mtp.scatter(x[y_pred == 1, 0], x[y_pred == 1,1], s = 100, c = 'red', label = 'Cluster 2')
mtp.scatter(x[y_pred== 2, 0], x[y_pred == 2,1], s = 100, c = 'cyan', label = 'Cluster 3')
mtp.scatter(x[y_pred == 3, 0], x[y_pred == 3,1], s = 100, c = 'black', label = 'Cluster 4')
mtp.scatter(x[y_pred == 4, 0], x[y_pred == 4,1], s = 100, c = 'orange', label = 'Cluster 5')
mtp.title('cluster of yrs married')
mtp.xlabel('Children')
mtp.legend()
mtp.show()
In [ ]: